Setup

local <- T #commandArgs(trailingOnly=T)
# Settings for running locally vs. on computer cluster 
if (local==T){ 
  nCores = parallel::detectCores() 
} else { 
  nCores = parallel::detectCores()#*4  ## Defaults to 1/4 of all cores 
} 
# Gather parameters from command line  
#dir.create(file.path(resultsPath,"cache"), showWarnings=F, recursive=T)  

# Have to setwd via knitr
# knitr::opts_knit$set(root.dir=resultsPath, child.path = resultsPath)
knitr::opts_chunk$set(echo=T, error=T#, root.dir = resultsPath#cache=T, cache.lazy=T,     
                      ) 
 
kableStyle = c("striped", "hover", "condensed", "responsive") 
# Utilize parallel processing later on 
print(paste("**** Utilized Cores **** =", parallel::detectCores() ))  
## [1] "**** Utilized Cores **** = 4"
print(paste("Saving results to: ", params$resultsPath))
## [1] "Saving results to:  /Users/schilder/Desktop/PD_scRNAseq/Results/subsetGenes-protein_coding__subsetCells-400__Resolution-0.6"

** subsetGenes-protein_coding__subsetCells-400__Resolution-0.6 **

Load Libraries & Report Versions

library(Seurat)
library(dplyr)
library(gridExtra)
library(knitr)
library(kableExtra) 
library(plotly) 
library(reshape2)
library(shiny)
library(heatmaply) 
# 
# install.packages('devtools')
# devtools::install_github('talgalili/heatmaply')
## Install Bioconductor
#  if (!requireNamespace("BiocManager"))
#     install.packages("BiocManager")
# BiocManager::install(c("biomaRt"))
library(biomaRt) 

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.1
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] biomaRt_2.38.0    heatmaply_0.15.3  viridis_0.5.1    
##  [4] viridisLite_0.3.0 shiny_1.2.0       reshape2_1.4.3   
##  [7] plotly_4.8.0      kableExtra_0.9.0  knitr_1.21       
## [10] gridExtra_2.3     dplyr_0.7.8       Seurat_2.3.4     
## [13] Matrix_1.2-15     cowplot_0.9.3     ggplot2_3.1.0    
## 
## loaded via a namespace (and not attached):
##   [1] snow_0.4-3           backports_1.1.3      Hmisc_4.1-1         
##   [4] plyr_1.8.4           igraph_1.2.2         lazyeval_0.2.1      
##   [7] splines_3.5.1        digest_0.6.18        foreach_1.4.4       
##  [10] htmltools_0.3.6      lars_1.2             gdata_2.18.0        
##  [13] memoise_1.1.0        magrittr_1.5         checkmate_1.8.5     
##  [16] cluster_2.0.7-1      mixtools_1.1.0       ROCR_1.0-7          
##  [19] gclus_1.3.1          readr_1.3.1          R.utils_2.7.0       
##  [22] prettyunits_1.0.2    colorspace_1.3-2     blob_1.1.1          
##  [25] rvest_0.3.2          xfun_0.4             crayon_1.3.4        
##  [28] RCurl_1.95-4.11      jsonlite_1.6         bindr_0.1.1         
##  [31] survival_2.43-3      zoo_1.8-4            iterators_1.0.10    
##  [34] ape_5.2              glue_1.3.0           registry_0.5        
##  [37] gtable_0.2.0         webshot_0.5.1        kernlab_0.9-27      
##  [40] BiocGenerics_0.28.0  prabclus_2.2-6       DEoptimR_1.0-8      
##  [43] scales_1.0.0         mvtnorm_1.0-8        DBI_1.0.0           
##  [46] bibtex_0.4.2         Rcpp_1.0.0           metap_1.0           
##  [49] dtw_1.20-1           progress_1.2.0       xtable_1.8-3        
##  [52] htmlTable_1.12       reticulate_1.10      foreign_0.8-71      
##  [55] bit_1.1-14           proxy_0.4-22         mclust_5.4.2        
##  [58] SDMTools_1.1-221     Formula_1.2-3        stats4_3.5.1        
##  [61] tsne_0.1-3           htmlwidgets_1.3      httr_1.4.0          
##  [64] gplots_3.0.1         RColorBrewer_1.1-2   fpc_2.1-11.1        
##  [67] acepack_1.4.1        modeltools_0.2-22    ica_1.0-2           
##  [70] pkgconfig_2.0.2      XML_3.98-1.16        R.methodsS3_1.7.1   
##  [73] flexmix_2.3-14       nnet_7.3-12          AnnotationDbi_1.44.0
##  [76] tidyselect_0.2.5     rlang_0.3.0.1        later_0.7.5         
##  [79] munsell_0.5.0        tools_3.5.1          RSQLite_2.1.1       
##  [82] ggridges_0.5.1       evaluate_0.12        stringr_1.3.1       
##  [85] yaml_2.2.0           npsurv_0.4-0         bit64_0.9-7         
##  [88] fitdistrplus_1.0-11  robustbase_0.93-3    caTools_1.17.1.1    
##  [91] purrr_0.2.5          RANN_2.6             dendextend_1.9.0    
##  [94] bindrcpp_0.2.2       pbapply_1.3-4        nlme_3.1-137        
##  [97] whisker_0.3-2        mime_0.6             R.oo_1.22.0         
## [100] xml2_1.2.0           hdf5r_1.0.1          compiler_3.5.1      
## [103] rstudioapi_0.8       png_0.1-7            lsei_1.2-0          
## [106] tibble_1.4.2         stringi_1.2.4        lattice_0.20-38     
## [109] trimcluster_0.1-2.1  pillar_1.3.1         Rdpack_0.10-1       
## [112] lmtest_0.9-36        data.table_1.11.8    bitops_1.0-6        
## [115] irlba_2.3.2          seriation_1.2-3      gbRd_0.4-11         
## [118] httpuv_1.4.5.1       R6_2.3.0             latticeExtra_0.6-28 
## [121] promises_1.0.1       TSP_1.1-6            KernSmooth_2.23-15  
## [124] IRanges_2.16.0       codetools_0.2-16     MASS_7.3-51.1       
## [127] gtools_3.8.1         assertthat_0.2.0     withr_2.1.2         
## [130] S4Vectors_0.20.1     diptest_0.75-7       parallel_3.5.1      
## [133] doSNOW_1.0.16        hms_0.4.2            grid_3.5.1          
## [136] rpart_4.1-13         tidyr_0.8.2          class_7.3-14        
## [139] rmarkdown_1.11       segmented_0.5-3.0    Rtsne_0.15          
## [142] Biobase_2.42.0       base64enc_0.1-3
print(paste("Seurat ", packageVersion("Seurat")))
## [1] "Seurat  2.3.4"

Load Data

#setwd("~/Desktop/PD_scRNAseq/")
dir.create(file.path(root,"Data"), showWarnings=F) 
load(file.path(root,"Data/seurat_object_add_HTO_ids.Rdata"))
pbmc <- seurat.obj  
rm(seurat.obj)

Pre-filtered Dimensions

pbmc
## An object of class seurat in project RAJ_13357 
##  24914 genes across 22113 samples.

Clean Metadata

Add Metadata

metadata <- read.table(file.path(root,"Data/meta.data4.tsv"))
kable(head(metadata), caption = "Metadata") %>%  
  kable_styling(bootstrap_options = kableStyle) %>%
  scroll_box(width = "100%", height = "500px")
Metadata
ID nGene nUMI orig.ident singlet.or.not.binary percent.mito res.2 res.1 res.0.6 res.0.3 barcode dx mut Ethnicity Gender Age
AAAGCAAGTTTGTTGG BIMD0007 784 1780 RAJ_13357 1 0.0314607 3 3 1 0 AAAGCAAGTTTGTTGG PD PD White M 59
TCAGCAATCTTGACGA BIMD0007 742 1854 RAJ_13357 1 0.0302213 18 1 0 0 TCAGCAATCTTGACGA PD PD White M 59
AGCTCCTTCGCGTAGC BIMD0007 495 988 RAJ_13357 1 0.0404858 14 7 2 3 AGCTCCTTCGCGTAGC PD PD White M 59
TATTACCCACTCTGTC BIMD0007 812 1874 RAJ_13357 1 0.0469584 16 12 7 6 TATTACCCACTCTGTC PD PD White M 59
CTCGAGGAGCGATTCT BIMD0007 863 2260 RAJ_13357 1 0.0212578 1 0 1 0 CTCGAGGAGCGATTCT PD PD White M 59
ATAAGAGCATCAGTCA BIMD0007 803 2034 RAJ_13357 1 0.0216323 9 8 0 0 ATAAGAGCATCAGTCA PD PD White M 59
# Make AgeGroups
makeAgeGroups <- function(){
  dim(metadata)
  getMaxRound <- function(vals=metadata$Age, unit=10)unit*ceiling((max(vals)/unit))
  getMinRound <- function(vals=metadata$Age, unit=10)unit*floor((min(vals)/unit)) 
   
  ageBreaks = c(seq(getMinRound(), getMaxRound(), by = 10), getMaxRound()+10)
  AgeGroupsUniq <- c()
  for (i in 1:(length(ageBreaks)-1)){ 
    AgeGroupsUniq <- append(AgeGroupsUniq, paste(ageBreaks[i],ageBreaks[i+1], sep="-")) 
  } 
  data.table::setDT(metadata,keep.rownames = T,check.names = F)[, AgeGroups := cut(Age, 
                                  breaks = ageBreaks, 
                                  right = F, 
                                  labels = AgeGroupsUniq,
                                  nclude.lowest=T)]
  metadata <- data.frame(metadata)
  unique(metadata$AgeGroups)
  head(metadata)
  dim(metadata)
  return(metadata)
}
# metadata <- makeAgeGroups()

pbmc <- AddMetaData(object = pbmc, metadata = metadata)  
# Get rid of any NAs (cells that don't match up with the metadata) 
if(subsetCells==F){
  pbmc <- FilterCells(object = pbmc,  subset.names = "nGene", low.thresholds = 0)
} else {pbmc <- FilterCells(object = pbmc,  subset.names = "nGene", low.thresholds = 0,
                    # Subset for testing 
                    cells.use = pbmc@cell.names[0:subsetCells]
                    )
}  

Filter & Normalize Data

Subset Genes by Biotype

Include only subsets of genes by type. Biotypes from: https://useast.ensembl.org/info/genome/genebuild/biotypes.html

## Seurat::FindGeneTerms() # Enrichr API
## Seurat::MultiModal_CCA() # Integrates data from disparate datasets (CIA version too)
if( subsetGenes!=F ){
  print(paste("Subsetting genes:",subsetGenes))
  # If the gene_biotypes file exists, import csv. Otherwise, get from biomaRt
  if(file_test("-f", file.path(root,"Data/gene_biotypes.csv"))){
    biotypes <- read.csv(file.path(root,"Data/gene_biotypes.csv"))
  }
  else {
    ensembl <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org",
                     dataset="hsapiens_gene_ensembl") 
    ensembl <- useDataset(mart = ensembl, dataset = "hsapiens_gene_ensembl")
    listFilters(ensembl)
    listAttributes(ensembl)   
    biotypes <- getBM(attributes=c("hgnc_symbol", "gene_biotype"), filters="hgnc_symbol",
          values=row.names(pbmc@data), mart=ensembl) 
    write.csv(biotypes, file.path(root,"Data/gene_biotypes.csv"), quote=F, row.names=F)
  } 
  # Subset data by creating new Seurat object (annoying but necessary)
  geneSubset <- biotypes[biotypes$gene_biotype==subsetGenes,"hgnc_symbol"] 
  
  print(paste(dim(pbmc@raw.data[geneSubset, ])[1],"/", dim(pbmc@raw.data)[1], 
              "genes are", subsetGenes))
  # Add back into pbmc 
  subset.matrix <- pbmc@raw.data[geneSubset, ] # Pull the raw expression matrix from the original Seurat object containing only the genes of interest
  pbmc_sub <- CreateSeuratObject(subset.matrix) # Create a new Seurat object with just the genes of interest
  orig.ident <- row.names(pbmc@meta.data) # Pull the identities from the original Seurat object as a data.frame
  pbmc_sub <- AddMetaData(object = pbmc_sub, metadata = pbmc@meta.data) # Add the idents to the meta.data slot
  pbmc_sub <- SetAllIdent(object = pbmc_sub, id = "ident") # Assign identities for the new Seurat object
  pbmc <- pbmc_sub
  rm(pbmc_sub) 
} 
## [1] "Subsetting genes: protein_coding"
## [1] "14827 / 24914 genes are protein_coding"

Subset Cells

Filter by cells, normalize , filter by gene variability.

pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"), 
    low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))

pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", 
    scale.factor = 10000)

Subset Genes by Variance

** Important!**: Specify do.par = T, and num.cores = nCores in ‘ScaleData’ to use all available cores.

# Store the top most variable genes in @var.genes
pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR,
    x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

# IMPORTANT!: Must set do.par=T and num.cors = n for large datasets being processed on computing clusters
pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"), do.par = T, num.cores = nCores)
## Regressing out: nUMI, percent.mito
## 
## Time Elapsed:  7.74281883239746 secs
## Scaling data matrix

Filtered Dimensions

pbmc
## An object of class seurat in project SeuratProject 
##  14827 genes across 396 samples.

Diagnostic Plots

Violin Plots

vp <- VlnPlot(object = pbmc, features.plot = c("nGene", "nUMI", "percent.mito"),nCol = 3, do.return = T) %>% + ggplot2::aes(alpha=0.5)
vp

Gene Plots

percent.mito plot

# par(mfrow = c(1, 2))
gp1 <- GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "percent.mito", pch.use=20, 
         do.hover=T, data.hover = "mut")
gp1

nGene plot

gp2 <- GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "nGene", pch.use=20, 
         do.hover=T, data.hover = "mut")
gp2

Dimensionality Reduction

PCA

ProjectPCA scores each gene in the dataset (including genes not included in the PCA) based on their correlation with the calculated components. Though we don’t use this further here, it can be used to identify markers that are strongly correlated with cellular heterogeneity, but may not have passed through variable gene selection. The results of the projected PCA can be explored by setting use.full=T in the functions above

  • Other Dim Reduction Methods in Seurat
  • RunCCA()
  • RunMultiCCA()
  • RunDiffusion()
  • RunPHATE()
  • RunUMAP()
  • RunICA()
# Run PCA with only the top most variables genes
pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print=F)
  #, pcs.print = 1:5,  genes.print = 5

VizPCA

VizPCA(object = pbmc, pcs.use = 1:2)

PCA plot

PCAPlot(object = pbmc, dim.1 = 1, dim.2 = 2, do.hover=T, data.hover="mut")

PCHeatmaps

pbmc <- ProjectPCA(object = pbmc, do.print=F)

cells.use  <- ifelse(subsetCells==F, 500, subsetCells) 
# 'PCHeatmap' is a wrapper for heatmap.2 
# PCA Heatmap: PC1-PCn
pcp <- PCHeatmap(object = pbmc, pc.use = 1:12, cells.use = cells.use, do.balanced=T, 
    label.columns=F, use.full=F, do.return = T) 
## Error in dim.scores[cells.use, paste0(dim.key, ndim)]: subscript out of bounds
# 
# PCHeatmap_interactive <- function(PC=1){  
#   PC_dat <- PCHeatmap(object = pbmc, pc.use = PC, do.return = T) 
#   # Cluster samples
#   Xclust <- pcp %>% dist(upper = T) %>% hclust() 
#   Yclust <- PC_dat %>% t() %>% dist(upper = T) %>% hclust() 
#   PC_dat <- PC_dat[Xclust$order, Yclust$order]  
#   # Plotly 
#   plot_ly(y=row.names(PC_dat), z=matrix(PC_dat), type = "heatmap", 
#           colors =viridis::plasma(n=100))
# }
# PCHeatmap_interactive(PC=1)   

Significant PCs

Determine statistically significant PCs for further analysis. NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in PCElbowPlot() can be used to reduce computation time

#pbmc <- JackStraw(object = pbmc, num.replicate = 100, display.progress = FALSE)
PCElbowPlot(object = pbmc)

Find Cell Clusters

We first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To cluster the cells, we apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [SLM, Blondel et al., Journal of Statistical Mechanics], to iteratively group cells together, with the goal of optimizing the standard modularity function.

On Resolution
The FindClusters function implements the procedure, and contains a resolution parameter that sets the ‘granularity’ of the downstream clustering, with increased values leading to a greater number of clusters. We find that setting this parameter between 0.6-1.2 typically returns good results for single cell datasets of around 3K cells. Optimal resolution often increases for larger datasets. The clusters are saved in the object@ident slot.

# TRY DIFFERENT RESOLUTIONS
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, 
    resolution = resolution, print.output = 0, save.SNN = T) 
## Error in RunModularityClusteringCpp(SNN, modularity, resolution, algorithm, : Not compatible with requested type: [type=character; target=double].
PrintFindClustersParams(object = pbmc) 
## Error in PrintFindClustersParams(object = pbmc): No stored clusterings.

t-SNE

As input to the tSNE, we suggest using the same PCs as input to the clustering analysis, although computing the tSNE based on scaled gene expression is also supported using the genes.use argument.

** Important!**: Specify num_threads=0 in ‘RunTSNE’ to use all available cores.

labSize <- 6
#pbmc <- StashIdent(object = pbmc, save.name = "pre_clustering") 
#pbmc <- SetAllIdent(object = pbmc, id = "pre_clustering") 

pbmc <- RunTSNE(object=pbmc,  reduction.use = "pca", dims.use = 1:10, do.fast = TRUE, 
                tsne.method = "Rtsne", num_threads=0) # num_threads
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object = pbmc, do.label=T, label.size = labSize, do.return=T) %>% ggplotly() 

t-SNE + Metadata Plots

tSNE_metadata_plot <- function(var){
  print(paste("t-SNE Metadata plot for ", var))
  # Metadata plot 
  p1 <- TSNEPlot(pbmc, do.return = T,  do.label = T,  group.by = var, pt.size=1,
                 plot.title=paste("Color by ",var), vector.friendly=T) %>% ggplotly() %>% 
     layout(legend = list(orientation = 'h', xanchor = "center", x = 0.5, y = .999)) 
  # t-SNE clusters plot
  p2 <- TSNEPlot(pbmc, do.return = T, do.label = T, pt.size=1,
                 plot.title=paste("Color by Clusters"), vector.friendly=T) %>% ggplotly() %>% 
  layout(legend = list(orientation = 'h', xanchor = "center", x = 0.5, y = .999)) 
  #print(plot_grid(ggplotly(p1), ggplotly(p2)))
  fluidPage( 
    fluidRow(
      column(6, p1), column(6, p2) 
    )
  )
}   
# metaVars <- c(dx","mut","Gender","Age")
# 
# for (var in metaVars){
#   print(paste("t-SNE Metadata plot for ",var))
#   # Metadata plot 
#   p1 <- TSNEPlot(pbmc, do.return = T, pt.size = 0.5, group.by = var, do.label = T, 
#                  dark.theme=F, plot.title=paste("Color by ",var))
#   # t-SNE clusters plot
#   p2 <- TSNEPlot(pbmc, do.label = T, do.return = T, pt.size = 0.5, plot.title=paste("Color by t-SNE clusters"))
#   print(plot_grid(p1, p2))
# }   

Disease

tSNE_metadata_plot("dx") 
## [1] "t-SNE Metadata plot for  dx"

Mutations

tSNE_metadata_plot("mut") 
## [1] "t-SNE Metadata plot for  mut"

Gender

tSNE_metadata_plot("Gender") 
## [1] "t-SNE Metadata plot for  Gender"

Age

tSNE_metadata_plot("Age") 
## [1] "t-SNE Metadata plot for  Age"

Cluster Biomarkers

Seurat has several tests for differential expression which can be set with the test.use parameter (see the DE vignette for details). For example, the ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect).

### Biomarkers: All Clusters vs. All Other Clusters ***
# find markers for every cluster compared to all remaining cells, report
# only the positive ones
pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
## Cell group 2 is empty - no cells with identity class
## Warning in FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, :
## No DE genes identified.
topBiomarkers <- pbmc.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
## Error in grouped_df_impl(data, unname(vars), drop): Column `cluster` is unknown
kable(topBiomarkers) %>% kable_styling(bootstrap_options = kableStyle) 
## Error in kable(topBiomarkers): object 'topBiomarkers' not found

Cluster Biomarker Tests

getTopBiomarker <- function(pbmc.markers, clusterID, topN=1){
  df <- subset(pbmc.markers, p_val_adj<0.05 & cluster==as.character(clusterID)) %>% arrange(desc(avg_logFC))
  top_pct_markers <- df[1:topN,"gene"]
  return(top_pct_markers)
}
# clust1_biomarkers <- getTopBiomarker(pbmc.markers, clusterID=1, topN=2)
# clust2_biomarkers <- getTopBiomarker(pbmc.markers, clusterID=2, topN=2)


### Plot biomarkers 
plotBiomarkers <- function(pbmc, biomarkers, cluster){
  biomarkerPlots <- list()
  for (marker in biomarkers){ 
    p <- VlnPlot(object = pbmc, features.plot = c(marker), y.log=T, return.plotlist=T) 
    biomarkerPlots[[marker]] <- p + ggplot2::aes(alpha=0.5) + xlab( "Cluster") + ylab( "Expression")
  }
  combinedPlot <- do.call(grid.arrange, c(biomarkerPlots, list(ncol=2, top=paste("Top DEG Biomarkers for Cluster",cluster))) ) 

  # biomarkerPlots <- lapply(biomarkers, function(marker) {
  #   VlnPlot(object = pbmc, features.plot = c(marker), y.log=T, return.plotlist=T) %>% + ggplot2::ggtitle(marker) %>% ggplotly() 
  # })    
  # return(subplot(biomarkerPlots) )
}   

top1 <- pbmc.markers %>% group_by(cluster) %>% top_n(1, avg_logFC) 
## Error in grouped_df_impl(data, unname(vars), drop): Column `cluster` is unknown
nCols <- floor( sqrt(length(unique(top1$cluster))) )   
## Error in unique(top1$cluster): object 'top1' not found
figHeight <- nCols *7
## Error in eval(expr, envir, enclos): object 'nCols' not found
# Plot top 2 biomarker genes for each 
for (clust in unique(pbmc.markers$cluster)){ 
   cat('\n')   
   cat("### Cluster ",clust,"\n") 
   biomarkers <- getTopBiomarker(pbmc.markers, clusterID=clust, topN=2)
   plotBiomarkers(pbmc, biomarkers, clust)  
   cat('\n')   
} 

Top Biomarker Plot

Biomarkers tSNE

fp <- FeaturePlot(object = pbmc, features.plot = top1$gene, cols.use = c("grey", "purple"), 
    reduction.use = "tsne", nCol = nCols, do.return = T) 
## Error in FeaturePlot(object = pbmc, features.plot = top1$gene, cols.use = c("grey", : object 'nCols' not found

Biomarkers Heatmap

top5 <- pbmc.markers %>% group_by(cluster) %>% top_n(5, avg_logFC)
## Error in grouped_df_impl(data, unname(vars), drop): Column `cluster` is unknown
# setting slim.col.label to TRUE will print just the cluster IDS instead of
# every cell name
DoHeatmap(object = pbmc, genes.use = top5$gene, slim.col.label=T, remove.key=T) %>% ggplotly()
## Error in SetIfNull(x = genes.use, default = rownames(x = data.use)): object 'top5' not found

Biomarkers Ridgeplot

RidgePlot(pbmc, features.plot = top1$gene,  nCol = nCols, do.sort = F)
## Error in RidgePlot(pbmc, features.plot = top1$gene, nCol = nCols, do.sort = F): object 'nCols' not found

Map Clusters to Marker-Defined Cell Types

  • Known Monocytes Biomarkers
  • Classical: CD14++ / CD16–
  • Intermediate: CD14++ / CD16+
  • Nonclassical: CD14+ / CD16++ (not captured in this data)
markerList <- c("CD14", "FCGR3A") 

genes_by_cluster <- function(pbmc, markerList){
  marker.matrix <- pbmc@scale.data[row.names(pbmc@scale.data) %in% markerList, ]   
  markerMelt <- reshape2:::melt.matrix(marker.matrix)
  colnames(markerMelt) <- c("Gene", "Cell", "Expression")
  identData <- data.frame(pbmc@ident)  
  identData$Cell <- row.names(identData)
  colnames(identData) <- c("Cluster","Cell") 
  markerDF <- merge(markerMelt,  identData, by = "Cell")
 
  return(markerDF)
}
markerDF <- genes_by_cluster(pbmc, markerList)

Known Biomarkers: Heatmaps

Clusters Averaged

# Show mean exp for each marker
avgMarker <- markerDF %>% group_by(Gene, Cluster) %>% summarise(meanExp = mean(Expression)) 
ggplot(data = avgMarker, aes(x=Gene, y=Cluster, fill=meanExp)) %>% + geom_tile() %>% + scale_fill_distiller(palette="viridis") %>% ggplotly()
## Warning in pal_name(palette, type): Unknown palette viridis

Clusters Separated

markerMelt <- reshape2::acast(markerDF, Cell~Gene, value.var="Expression", fun.aggregate = mean, drop = F, fill = 0) 

#plot_ly(  z = markerMelt, y=row.names(markerMelt), z=colnames(markerMelt), type="heatmap")
# dx_colors <- colorRampPalette(brewer.pal(2, "RdBu"))
# mut_colors <- colorRampPalette(brewer.pal(length(unique(pbmc@meta.data$mut)), "Set3"))
Spectral <- colorRampPalette(brewer.pal(length(unique(pbmc@meta.data$mut)), "Spectral"))  
## Error in brewer.pal(length(unique(pbmc@meta.data$mut)), "Spectral"): could not find function "brewer.pal"
heatmaply(markerMelt,  key.title="Expression",#plot_method= "plotly",
          k_row = length(unique(pbmc.markers)), dendrogram = "row",
          showticklabels = c(T, F), xlab = "Known Markers", ylab = "Cells", column_text_angle = 0,
          row_side_colors =  pbmc@meta.data[,c("dx","mut")], row_side_palette = Spectral
          )  %>%  colorbar(tickfont = list(size = 12), titlefont = list(size = 14), which = 2)  %>% 
          colorbar(tickfont = list(size = 12), titlefont = list(size = 14), which = 1) 
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: /Library/Frameworks/R.framework/Resources/modules//R_X11.so
##   Reason: image not found
## Warning: Didn't find a colorbar to modify.

## Warning: Didn't find a colorbar to modify.

Known Biomarkers: Boxplot

ggplot(data = markerDF, aes(x=Cluster, y=Expression, fill=Gene)) %>% 
  + geom_boxplot(alpha=0.5) %>% + scale_fill_manual(values=c("purple", "turquoise")) # %>% ggplotly()  

Known Biomarkers: tSNE

expressionTSNE <- function(pbmc, marker, colors=c("grey", "red")){
  FeaturePlot(object = pbmc, features.plot = marker, cols.use = colors, 
    reduction.use = "tsne", nCol=2, do.return = T, dark.theme = T)[[1]] %>% ggplotly()
}
 
 subplot(expressionTSNE(pbmc, markerList[1]), 
        expressionTSNE(pbmc, markerList[2],  colors=c("grey", "green")))

Label Clusters by Known Biomarker

current.cluster.ids <- unique(pbmc.markers$cluster) #c(0, 1, 2, 3, 4, 5, 6, 7)
top1 <- pbmc.markers %>% group_by(cluster) %>% top_n(1, avg_logFC)
## Error in grouped_df_impl(data, unname(vars), drop): Column `cluster` is unknown
new.cluster.ids <- top1$gene #c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes")
## Error in eval(expr, envir, enclos): object 'top1' not found
pbmc@ident <- plyr::mapvalues(x = pbmc@ident, from = current.cluster.ids, to = new.cluster.ids)
## Error in plyr::mapvalues(x = pbmc@ident, from = current.cluster.ids, to = new.cluster.ids): object 'new.cluster.ids' not found
TSNEPlot(object=pbmc, do.label=T, pt.size=0.5, do.return=T) %>% ggplotly()

Cell Sub-clusters

Further subdivisions within cell types.
If you perturb some of our parameter choices above (for example, setting resolution=0.8 or changing the number of PCs), you might see the CD4 T cells subdivide into two groups. You can explore this subdivision to find markers separating the two T cell subsets. However, before reclustering (which will overwrite object@ident), we can stash our renamed identities to be easily recovered later.

Assign Identity

# First lets stash our identities for later
pbmc <- StashIdent(object = pbmc, save.name = "ClusterNames_0.6")

# Note that if you set save.snn=T above, you don't need to recalculate the
# SNN, and can simply put: pbmc <- FindClusters(pbmc,resolution = 0.8)
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, 
    resolution = 0.8, print.output = FALSE)

## Warning in BuildSNN(object = object, genes.use = genes.use, reduction.type
## = reduction.type, : Build parameters exactly match those of already
## computed and stored SNN. To force recalculation, set force.recalc to TRUE.

# Demonstration of how to plot two tSNE plots side by side, and how to color
# points based on different criteria
plot1 <- TSNEPlot(object = pbmc, do.return = TRUE, no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot2 <- TSNEPlot(object = pbmc, do.return = TRUE, group.by = "ClusterNames_0.6", 
                  no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot_grid(plot1, plot2)

Find Markers

# Find discriminating markers
tcell.markers <- FindMarkers(object = pbmc, ident.1 = 0, ident.2 = 1)

# Most of the markers tend to be expressed in C1 (i.e. S100A4). However, we
# can see that CCR7 is upregulated in C0, strongly indicating that we can
# differentiate memory from naive CD4 cells.  cols.use demarcates the color
# palette from low to high expression
FeaturePlot(object = pbmc, features.plot = top1$gene, cols.use = c("green", "blue"))
## Error in FeaturePlot(object = pbmc, features.plot = top1$gene, cols.use = c("green", : object 'top1' not found
pbmc <- SetAllIdent(object = pbmc, id = "ClusterNames_0.6") 
# Save results for EACH run (in their respective subfolders)
saveRDS(pbmc, file=file.path(params$resultsPath, "cd14-processed.rds") )